Let us load the dataset from the file
spotify_dataset.csv into a data frame named
df_spotify. Because this file is too large to upload to
GitHub, we store it in a folder called
spotify_dataset, located one level above the
project folder.
# Load the Spotify tracks dataset into df_spotify.
# read.csv() already returns a data.frame, so no data.frame() wrapper is needed.
# NOTE(review): the accompanying text says the CSV lives in a folder named
# spotify_dataset one level above the project, but this call reads the file
# from the current working directory -- confirm which location is intended.
df_spotify <- read.csv("spotify_dataset.csv")
# Show a compact overview: dimensions, column types and the first values.
str(df_spotify)
## 'data.frame': 114000 obs. of 21 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ track_id : chr "5SuOikwiRyPMVoIQDJUgSV" "4qPNDBW1i3p13qLCt0Ki3A" "1iJBSr7s7jYXzM8EGcbK5b" "6lfxq3CG4xtTiEg7opyCyx" ...
## $ artists : chr "Gen Hoshino" "Ben Woodward" "Ingrid Michaelson;ZAYN" "Kina Grannis" ...
## $ album_name : chr "Comedy" "Ghost (Acoustic)" "To Begin Again" "Crazy Rich Asians (Original Motion Picture Soundtrack)" ...
## $ track_name : chr "Comedy" "Ghost - Acoustic" "To Begin Again" "Can't Help Falling In Love" ...
## $ popularity : int 73 55 57 71 82 58 74 80 74 56 ...
## $ duration_ms : int 230666 149610 210826 201933 198853 214240 229400 242946 189613 205594 ...
## $ explicit : chr "False" "False" "False" "False" ...
## $ danceability : num 0.676 0.42 0.438 0.266 0.618 0.688 0.407 0.703 0.625 0.442 ...
## $ energy : num 0.461 0.166 0.359 0.0596 0.443 0.481 0.147 0.444 0.414 0.632 ...
## $ key : int 1 1 0 0 2 6 2 11 0 1 ...
## $ loudness : num -6.75 -17.23 -9.73 -18.52 -9.68 ...
## $ mode : int 0 1 1 1 1 1 1 1 1 1 ...
## $ speechiness : num 0.143 0.0763 0.0557 0.0363 0.0526 0.105 0.0355 0.0417 0.0369 0.0295 ...
## $ acousticness : num 0.0322 0.924 0.21 0.905 0.469 0.289 0.857 0.559 0.294 0.426 ...
## $ instrumentalness: num 1.01e-06 5.56e-06 0.00 7.07e-05 0.00 0.00 2.89e-06 0.00 0.00 4.19e-03 ...
## $ liveness : num 0.358 0.101 0.117 0.132 0.0829 0.189 0.0913 0.0973 0.151 0.0735 ...
## $ valence : num 0.715 0.267 0.12 0.143 0.167 0.666 0.0765 0.712 0.669 0.196 ...
## $ tempo : num 87.9 77.5 76.3 181.7 119.9 ...
## $ time_signature : int 4 4 4 3 4 4 3 4 4 4 ...
## $ track_genre : chr "acoustic" "acoustic" "acoustic" "acoustic" ...
# Preview the first six rows of the dataset.
head(df_spotify, n = 6L)
## X track_id artists
## 1 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino
## 2 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward
## 3 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN
## 4 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis
## 5 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet
## 6 5 01MVOl9KtVTNfFiBU9I7dc Tyrone Wells
## album_name
## 1 Comedy
## 2 Ghost (Acoustic)
## 3 To Begin Again
## 4 Crazy Rich Asians (Original Motion Picture Soundtrack)
## 5 Hold On
## 6 Days I Will Remember
## track_name popularity duration_ms explicit danceability
## 1 Comedy 73 230666 False 0.676
## 2 Ghost - Acoustic 55 149610 False 0.420
## 3 To Begin Again 57 210826 False 0.438
## 4 Can't Help Falling In Love 71 201933 False 0.266
## 5 Hold On 82 198853 False 0.618
## 6 Days I Will Remember 58 214240 False 0.688
## energy key loudness mode speechiness acousticness instrumentalness liveness
## 1 0.4610 1 -6.746 0 0.1430 0.0322 1.01e-06 0.3580
## 2 0.1660 1 -17.235 1 0.0763 0.9240 5.56e-06 0.1010
## 3 0.3590 0 -9.734 1 0.0557 0.2100 0.00e+00 0.1170
## 4 0.0596 0 -18.515 1 0.0363 0.9050 7.07e-05 0.1320
## 5 0.4430 2 -9.681 1 0.0526 0.4690 0.00e+00 0.0829
## 6 0.4810 6 -8.807 1 0.1050 0.2890 0.00e+00 0.1890
## valence tempo time_signature track_genre
## 1 0.715 87.917 4 acoustic
## 2 0.267 77.489 4 acoustic
## 3 0.120 76.332 4 acoustic
## 4 0.143 181.740 3 acoustic
## 5 0.167 119.949 4 acoustic
## 6 0.666 98.017 4 acoustic
# Preview the last six rows of the dataset.
tail(df_spotify, n = 6L)
## X track_id artists
## 113995 113994 4WbOUe6T0sozC7z5ZJgiAA Lucas Cervetti
## 113996 113995 2C3TZjDRiAzdyViavDJ217 Rainy Lullaby
## 113997 113996 1hIz5L4IB9hN3WRYPOCGPw Rainy Lullaby
## 113998 113997 6x8ZfSoqDjuNa5SVP5QjvX Cesária Evora
## 113999 113998 2e6sXL2bYv4bSz6VTdnfLs Michael W. Smith
## 114000 113999 2hETkH7cOfqmz3LqZDHZf5 Cesária Evora
## album_name
## 113995 Frecuencias Álmicas en 432hz
## 113996 #mindfulness - Soft Rain for Mindful Meditation, Stress Relief Relaxation Music
## 113997 #mindfulness - Soft Rain for Mindful Meditation, Stress Relief Relaxation Music
## 113998 Best Of
## 113999 Change Your World
## 114000 Miss Perfumado
## track_name popularity duration_ms explicit danceability
## 113995 Frecuencia Álmica, Pt. 4 22 305454 False 0.331
## 113996 Sleep My Little Boy 21 384999 False 0.172
## 113997 Water Into Light 22 385000 False 0.174
## 113998 Miss Perfumado 22 271466 False 0.629
## 113999 Friends 41 283893 False 0.587
## 114000 Barbincor 22 241826 False 0.526
## energy key loudness mode speechiness acousticness instrumentalness
## 113995 0.171 1 -15.668 1 0.0350 0.920 0.0229
## 113996 0.235 5 -16.393 1 0.0422 0.640 0.9280
## 113997 0.117 0 -18.318 0 0.0401 0.994 0.9760
## 113998 0.329 0 -10.895 0 0.0420 0.867 0.0000
## 113999 0.506 7 -10.889 1 0.0297 0.381 0.0000
## 114000 0.487 1 -10.204 0 0.0725 0.681 0.0000
## liveness valence tempo time_signature track_genre
## 113995 0.0679 0.3270 132.147 3 world-music
## 113996 0.0863 0.0339 125.995 5 world-music
## 113997 0.1050 0.0350 85.239 4 world-music
## 113998 0.0839 0.7430 132.378 4 world-music
## 113999 0.2700 0.4130 135.960 4 world-music
## 114000 0.0893 0.7080 79.198 4 world-music
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(object = df_spotify)
## X track_id artists album_name
## Min. : 0 Length:114000 Length:114000 Length:114000
## 1st Qu.: 28500 Class :character Class :character Class :character
## Median : 57000 Mode :character Mode :character Mode :character
## Mean : 57000
## 3rd Qu.: 85499
## Max. :113999
## track_name popularity duration_ms explicit
## Length:114000 Min. : 0.00 Min. : 0 Length:114000
## Class :character 1st Qu.: 17.00 1st Qu.: 174066 Class :character
## Mode :character Median : 35.00 Median : 212906 Mode :character
## Mean : 33.24 Mean : 228029
## 3rd Qu.: 50.00 3rd Qu.: 261506
## Max. :100.00 Max. :5237295
## danceability energy key loudness
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :-49.531
## 1st Qu.:0.4560 1st Qu.:0.4720 1st Qu.: 2.000 1st Qu.:-10.013
## Median :0.5800 Median :0.6850 Median : 5.000 Median : -7.004
## Mean :0.5668 Mean :0.6414 Mean : 5.309 Mean : -8.259
## 3rd Qu.:0.6950 3rd Qu.:0.8540 3rd Qu.: 8.000 3rd Qu.: -5.003
## Max. :0.9850 Max. :1.0000 Max. :11.000 Max. : 4.532
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00e+00
## 1st Qu.:0.0000 1st Qu.:0.03590 1st Qu.:0.0169 1st Qu.:0.00e+00
## Median :1.0000 Median :0.04890 Median :0.1690 Median :4.16e-05
## Mean :0.6376 Mean :0.08465 Mean :0.3149 Mean :1.56e-01
## 3rd Qu.:1.0000 3rd Qu.:0.08450 3rd Qu.:0.5980 3rd Qu.:4.90e-02
## Max. :1.0000 Max. :0.96500 Max. :0.9960 Max. :1.00e+00
## liveness valence tempo time_signature
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. :0.000
## 1st Qu.:0.0980 1st Qu.:0.2600 1st Qu.: 99.22 1st Qu.:4.000
## Median :0.1320 Median :0.4640 Median :122.02 Median :4.000
## Mean :0.2136 Mean :0.4741 Mean :122.15 Mean :3.904
## 3rd Qu.:0.2730 3rd Qu.:0.6830 3rd Qu.:140.07 3rd Qu.:4.000
## Max. :1.0000 Max. :0.9950 Max. :243.37 Max. :5.000
## track_genre
## Length:114000
## Class :character
## Mode :character
##
##
##
First, let’s check the percentage of NAs present in each column of the dataset.
# Percentage of missing values per column: the mean of the is.na() indicator
# for each column, scaled to percent.
100 * colMeans(is.na(df_spotify))
## X track_id artists album_name
## 0 0 0 0
## track_name popularity duration_ms explicit
## 0 0 0 0
## danceability energy key loudness
## 0 0 0 0
## mode speechiness acousticness instrumentalness
## 0 0 0 0
## liveness valence tempo time_signature
## 0 0 0 0
## track_genre
## 0
# Build the input for the correlation plot: keep only the numeric columns and
# drop X, which merely numbers the rows and carries no musical information.
# select(where(...)) replaces the superseded select_if(), and the column drop
# is done in the same step instead of a separate subset() call.
data_corr <- df_spotify %>%
  select(where(is.numeric), -X)
# Draw the pairwise correlation matrix of the remaining numeric features.
corrplot.mixed(cor(data_corr))
# Bar chart of the number of tracks at each popularity score.
# No fill mapping is used: geom_bar() counts rows per x value, and a continuous
# fill mapped to the same column is dropped during that counting step (recent
# ggplot2 versions warn about it), so it never affected the bar colours.
ggplot(df_spotify, aes(x = popularity)) +
  geom_bar()
# Distribution of danceability.
# danceability is a continuous score, so geom_bar() would draw one thin bar per
# distinct value; a histogram bins the values instead. 75 bins matches the
# other histograms in this report. The continuous fill mapping is omitted
# because it is dropped when rows are counted and had no visible effect.
ggplot(df_spotify, aes(x = danceability)) +
  geom_histogram(bins = 75)
# Histogram of acousticness over all tracks, using 75 bins.
acHist <- ggplot(data = df_spotify, mapping = aes(x = acousticness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of Acousticness")
acHist
# Scatter plot of acousticness against popularity.
# The point colour is a fixed geom parameter rather than an aes() mapping: a
# constant string inside aes() is treated as a one-level category, which draws
# the points in the default palette colour and adds a meaningless legend.
# "#e9ecef" (the value originally written) is almost the same shade as
# ggplot2's default grey panel background, so the report's histogram colour
# "#69b3a2" is used to keep the points visible.
acSctr <- ggplot(df_spotify, aes(x = popularity, y = acousticness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for Acousticness")
acSctr
# Violin plot of popularity across ranges of acousticness.
# Author note: this plot may work better alongside violin plots of the other
# variables.
# geom_violin() draws one violin per discrete x value. With the raw continuous
# acousticness on x, the tracks are not split into groups and everything ends
# up in a single violin, so acousticness is binned into 0.2-wide intervals
# starting at 0 with cut_width().
# The fill colour is a fixed geom parameter: a constant string inside aes() is
# treated as a category, which ignores the hex colour and adds a bogus legend.
acV <- ggplot(
  df_spotify,
  aes(x = cut_width(acousticness, 0.2, boundary = 0), y = popularity)
) +
  geom_violin(fill = "#69b3a2") +
  labs(x = "acousticness (binned)") +
  ggtitle("Violinplot for Acousticness")
acV
# Basic histograms and scatter plots for the remaining variables that are not
# part of our SMART question.
# Histogram of speechiness over all tracks, using 75 bins.
speechinessHist <- ggplot(data = df_spotify, mapping = aes(x = speechiness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of speechiness")
speechinessHist
# Scatter plot of speechiness against popularity.
# The colour is set on the geom, not inside aes(): a constant string in aes()
# is treated as a category, giving the default palette colour plus a legend.
# "#69b3a2" replaces the original "#e9ecef", which is almost invisible on
# ggplot2's default grey panel background.
speechinessSct <- ggplot(df_spotify, aes(x = popularity, y = speechiness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for speechiness")
speechinessSct
# Histogram of instrumentalness over all tracks, using 75 bins.
instrumentalnessHist <- ggplot(
  data = df_spotify,
  mapping = aes(x = instrumentalness)
) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of instrumentalness")
instrumentalnessHist
# Scatter plot of instrumentalness against popularity.
# The colour is set on the geom, not inside aes(): a constant string in aes()
# is treated as a category, giving the default palette colour plus a legend.
# "#69b3a2" replaces the original "#e9ecef", which is almost invisible on
# ggplot2's default grey panel background.
instrumentalnessSctr <- ggplot(
  df_spotify,
  aes(x = popularity, y = instrumentalness)
) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for instrumentalness")
instrumentalnessSctr
# Histogram of liveness over all tracks, using 75 bins.
livenessHist <- ggplot(data = df_spotify, mapping = aes(x = liveness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of liveness")
livenessHist
# Scatter plot of liveness against popularity.
# The colour is set on the geom, not inside aes(): a constant string in aes()
# is treated as a category, giving the default palette colour plus a legend.
# "#69b3a2" replaces the original "#e9ecef", which is almost invisible on
# ggplot2's default grey panel background.
livenessSct <- ggplot(df_spotify, aes(x = popularity, y = liveness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for liveness")
livenessSct
# Histogram of valence over all tracks, using 75 bins.
# The title now names valence; it previously read "instrumentalness", carried
# over from the instrumentalness histogram.
valenceHist <- ggplot(df_spotify, aes(x = valence)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9, bins = 75) +
  ggtitle("Histogram of valence")
valenceHist
# Scatter plot of valence against popularity.
# The colour is set on the geom, not inside aes(): a constant string in aes()
# is treated as a category, giving the default palette colour plus a legend.
# "#69b3a2" replaces the original "#e9ecef", which is almost invisible on
# ggplot2's default grey panel background.
valenceSctr <- ggplot(df_spotify, aes(x = popularity, y = valence)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for valence")
valenceSctr
# Histogram of tempo over all tracks, using 75 bins.
tempoHist <- ggplot(data = df_spotify, mapping = aes(x = tempo)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of tempo")
tempoHist
# Scatter plot of tempo against popularity.
# The colour is set on the geom, not inside aes(): a constant string in aes()
# is treated as a category, giving the default palette colour plus a legend.
# "#69b3a2" replaces the original "#e9ecef", which is almost invisible on
# ggplot2's default grey panel background.
tempoSctr <- ggplot(df_spotify, aes(x = popularity, y = tempo)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for tempo")
tempoSctr